In [10]:
from preamble import *
%matplotlib inline
import matplotlib as mpl
mpl.rcParams['legend.numpoints'] = 1

Model evaluation and improvement


In [2]:
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# create a synthetic dataset
X, y = make_blobs(random_state=0)
# split data and labels into a training and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Instantiate a model and fit it to the training set
logreg = LogisticRegression().fit(X_train, y_train)
# evaluate the model on the test set
logreg.score(X_test, y_test)
# we predicted the correct class on 88% of the samples in X_test


Out[2]:
0.88

Cross-validation


In [3]:
mglearn.plots.plot_cross_validation()


Cross-validation in scikit-learn


In [1]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

iris = load_iris()
logreg = LogisticRegression()

scores = cross_val_score(logreg, iris.data, iris.target)
print("cross-validation scores: ", scores)


cross-validation scores:  [ 0.961  0.922  0.958]

In [2]:
scores = cross_val_score(logreg, iris.data, iris.target, cv=5)
scores


Out[2]:
array([ 1.        ,  0.96666667,  0.93333333,  0.9       ,  1.        ])

In [3]:
scores.mean()


Out[3]:
0.96000000000000019

Stratified K-Fold cross-validation and other strategies


In [7]:
from sklearn.datasets import load_iris
iris = load_iris()
print(iris.target)


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]

In [8]:
mglearn.plots.plot_stratified_cross_validation()
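
For classifiers, cross_val_score uses stratified k-fold splitting by default, so the sorted class labels shown above do not end up concentrated in single folds. The stratified splitter can also be passed explicitly; a minimal sketch:

from sklearn.model_selection import StratifiedKFold
# each of the three folds preserves the class proportions of iris.target
cross_val_score(logreg, iris.data, iris.target, cv=StratifiedKFold(n_splits=3))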


More control over cross-validation


In [4]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5)

In [5]:
cross_val_score(logreg, iris.data, iris.target, cv=kfold)


Out[5]:
array([ 1.        ,  0.93333333,  0.43333333,  0.96666667,  0.43333333])

In [6]:
kfold = KFold(n_splits=3)
cross_val_score(logreg, iris.data, iris.target, cv=kfold)


Out[6]:
array([ 0.,  0.,  0.])

In [7]:
kfold = KFold(n_splits=3, shuffle=True, random_state=0)
cross_val_score(logreg, iris.data, iris.target, cv=kfold)


Out[7]:
array([ 0.9 ,  0.96,  0.96])

Leave-One-Out cross-validation


In [8]:
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
scores = cross_val_score(logreg, iris.data, iris.target, cv=loo)
print("number of cv iterations: ", len(scores))
print("mean accuracy: ", scores.mean())


number of cv iterations:  150
mean accuracy:  0.953333333333

Shuffle-Split cross-validation


In [11]:
mglearn.plots.plot_shuffle_split()


In [25]:
from sklearn.model_selection import ShuffleSplit
shuffle_split = ShuffleSplit(test_size=.5, train_size=.5, n_splits=10)
cross_val_score(logreg, iris.data, iris.target, cv=shuffle_split)


Out[25]:
array([ 0.893,  0.947,  0.907,  0.947,  0.92 ,  0.933,  0.88 ,  0.947,
        0.84 ,  0.947])
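
ShuffleSplit draws its random splits without looking at the class labels; for classification there is a stratified variant. A minimal sketch using scikit-learn's StratifiedShuffleSplit:

from sklearn.model_selection import StratifiedShuffleSplit
# like ShuffleSplit, but each random split preserves the class proportions of the target
stratified_split = StratifiedShuffleSplit(test_size=.5, train_size=.5, n_splits=10)
cross_val_score(logreg, iris.data, iris.target, cv=stratified_split)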

Cross-validation with groups


In [26]:
print("label_kfold")
mglearn.plots.plot_label_kfold()


label_kfold

In [27]:
from sklearn.model_selection import GroupKFold
from sklearn.datasets import make_blobs
# create synthetic dataset
X, y = make_blobs(n_samples=12, random_state=0)
# assume the first three samples belong to the same group, the next four to another, and so on
groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
cross_val_score(logreg, X, y, groups=groups, cv=GroupKFold(n_splits=3))


Out[27]:
array([ 1. ,  0.8,  1. ])

In [29]:
# naive grid search implementation
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
print("Size of training set: %d   size of test set: %d" % (X_train.shape[0], X_test.shape[0]))

best_score = 0

for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters
        # train an SVC
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        # evaluate the SVC on the test set 
        score = svm.score(X_test, y_test)
        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}
            
print("best score: ", best_score)
print("best parameters: ", best_parameters)


Size of training set: 112   size of test set: 38
best score:  0.973684210526
best parameters:  {'gamma': 0.001, 'C': 100}

In [30]:
best_score


Out[30]:
0.97368421052631582

The danger of overfitting the parameters and the validation set


In [31]:
print("threefold_split")
mglearn.plots.plot_threefold_split()


threefold_split

In [32]:
from sklearn.svm import SVC
# split data into train+validation set and test set
X_trainval, X_test, y_trainval, y_test = train_test_split(iris.data, iris.target, random_state=0)
# split train+validation set into training and validation set
X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, random_state=1)

print("Size of training set: %d   size of validation set: %d   size of test set: %d" % (X_train.shape[0], X_valid.shape[0], X_test.shape[0]))
best_score = 0

for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters
        # train an SVC
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        # evaluate the SVC on the validation set
        score = svm.score(X_valid, y_valid)
        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}

# rebuild a model on the combined training and validation set, and evaluate it on the test set
svm = SVC(**best_parameters)
svm.fit(X_trainval, y_trainval)
test_score = svm.score(X_test, y_test)
print("best score on validation set: ", best_score)
print("best parameters: ", best_parameters)
print("test set score with best parameters: ", test_score)


Size of training set: 84   size of validation set: 28   size of test set: 38
best score on validation set:  0.964285714286
best parameters:  {'gamma': 0.001, 'C': 10}
test set score with best parameters:  0.921052631579

Grid-search with cross-validation


In [33]:
# reference: manual_grid_search_cv
best_score = 0

for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters
        # train an SVC
        svm = SVC(gamma=gamma, C=C)
        # perform cross-validation
        scores = cross_val_score(svm, X_trainval, y_trainval, cv=5)
        # compute mean cross-validation accuracy
        score = np.mean(scores)
        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}
# rebuild a model on the combined training and validation set
svm = SVC(**best_parameters)
svm.fit(X_trainval, y_trainval)


Out[33]:
SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [34]:
mglearn.plots.plot_cross_val_selection()



In [35]:
mglearn.plots.plot_grid_search_overview()



In [36]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
param_grid


Out[36]:
{'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}

In [37]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
grid_search = GridSearchCV(SVC(), param_grid, cv=5)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)

In [39]:
grid_search.fit(X_train, y_train)


Out[39]:
GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'gamma': [0.001, 0.01, 0.1, 1, 10, 100], 'C': [0.001, 0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [40]:
grid_search.score(X_test, y_test)


Out[40]:
0.97368421052631582

In [41]:
print(grid_search.best_params_)
print(grid_search.best_score_)


{'gamma': 0.01, 'C': 100}
0.973214285714

In [42]:
grid_search.best_estimator_


Out[42]:
SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

Analyzing the result of cross-validation


In [43]:
grid_search.grid_scores_


Out[43]:
[mean: 0.36607, std: 0.01137, params: {'gamma': 0.001, 'C': 0.001},
 mean: 0.36607, std: 0.01137, params: {'gamma': 0.01, 'C': 0.001},
 mean: 0.36607, std: 0.01137, params: {'gamma': 0.1, 'C': 0.001},
 mean: 0.36607, std: 0.01137, params: {'gamma': 1, 'C': 0.001},
 mean: 0.36607, std: 0.01137, params: {'gamma': 10, 'C': 0.001},
 mean: 0.36607, std: 0.01137, params: {'gamma': 100, 'C': 0.001},
 mean: 0.36607, std: 0.01137, params: {'gamma': 0.001, 'C': 0.01},
 mean: 0.36607, std: 0.01137, params: {'gamma': 0.01, 'C': 0.01},
 mean: 0.36607, std: 0.01137, params: {'gamma': 0.1, 'C': 0.01},
 mean: 0.36607, std: 0.01137, params: {'gamma': 1, 'C': 0.01},
 mean: 0.36607, std: 0.01137, params: {'gamma': 10, 'C': 0.01},
 mean: 0.36607, std: 0.01137, params: {'gamma': 100, 'C': 0.01},
 mean: 0.36607, std: 0.01137, params: {'gamma': 0.001, 'C': 0.1},
 mean: 0.69643, std: 0.01333, params: {'gamma': 0.01, 'C': 0.1},
 mean: 0.91964, std: 0.04442, params: {'gamma': 0.1, 'C': 0.1},
 mean: 0.95536, std: 0.03981, params: {'gamma': 1, 'C': 0.1},
 mean: 0.36607, std: 0.01137, params: {'gamma': 10, 'C': 0.1},
 mean: 0.36607, std: 0.01137, params: {'gamma': 100, 'C': 0.1},
 mean: 0.69643, std: 0.01333, params: {'gamma': 0.001, 'C': 1},
 mean: 0.92857, std: 0.04278, params: {'gamma': 0.01, 'C': 1},
 mean: 0.96429, std: 0.03405, params: {'gamma': 0.1, 'C': 1},
 mean: 0.94643, std: 0.03251, params: {'gamma': 1, 'C': 1},
 mean: 0.91964, std: 0.06507, params: {'gamma': 10, 'C': 1},
 mean: 0.50893, std: 0.04666, params: {'gamma': 100, 'C': 1},
 mean: 0.92857, std: 0.04278, params: {'gamma': 0.001, 'C': 10},
 mean: 0.96429, std: 0.03405, params: {'gamma': 0.01, 'C': 10},
 mean: 0.96429, std: 0.01793, params: {'gamma': 0.1, 'C': 10},
 mean: 0.93750, std: 0.04556, params: {'gamma': 1, 'C': 10},
 mean: 0.91964, std: 0.06507, params: {'gamma': 10, 'C': 10},
 mean: 0.56250, std: 0.04966, params: {'gamma': 100, 'C': 10},
 mean: 0.96429, std: 0.03405, params: {'gamma': 0.001, 'C': 100},
 mean: 0.97321, std: 0.02234, params: {'gamma': 0.01, 'C': 100},
 mean: 0.95536, std: 0.04983, params: {'gamma': 0.1, 'C': 100},
 mean: 0.94643, std: 0.05199, params: {'gamma': 1, 'C': 100},
 mean: 0.91964, std: 0.06507, params: {'gamma': 10, 'C': 100},
 mean: 0.56250, std: 0.04966, params: {'gamma': 100, 'C': 100}]

In [44]:
scores = [score.mean_validation_score for score in grid_search.grid_scores_]
scores = np.array(scores).reshape(6, 6)

# plot the mean cross-validation scores
mglearn.tools.heatmap(scores, xlabel='gamma', ylabel='C', xticklabels=param_grid['gamma'],
                      yticklabels=param_grid['C'], cmap="viridis")


Out[44]:
<matplotlib.collections.PolyCollection at 0x7f89a8579f60>
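
In later scikit-learn releases grid_scores_ was deprecated in favor of the cv_results_ dictionary. A minimal sketch of the equivalent extraction, assuming scikit-learn 0.18 or newer:

# mean cross-validation score for each of the 36 parameter combinations, reshaped into a 6x6 grid
scores = np.array(grid_search.cv_results_['mean_test_score']).reshape(6, 6)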

In [45]:
fig, axes = plt.subplots(1, 3, figsize=(13, 5))

param_grid_linear = {'C': np.linspace(1, 2, 6),
                     'gamma':  np.linspace(1, 2, 6)}

param_grid_one_log = {'C': np.linspace(1, 2, 6),
                     'gamma':  np.logspace(-3, 2, 6)}

param_grid_range = {'C': np.logspace(-3, 2, 6),
                     'gamma':  np.logspace(-7, -2, 6)}

for param_grid, ax in zip([param_grid_linear, param_grid_one_log,
                           param_grid_range], axes):
    grid_search = GridSearchCV(SVC(), param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    scores = [score.mean_validation_score for score in grid_search.grid_scores_]
    scores = np.array(scores).reshape(6, 6)

    # plot the mean cross-validation scores
    scores_image = mglearn.tools.heatmap(scores, xlabel='gamma', ylabel='C', xticklabels=param_grid['gamma'],
                                         yticklabels=param_grid['C'], cmap="viridis", ax=ax)
    
plt.colorbar(scores_image, ax=axes.tolist())
print("gridsearch_failures")


gridsearch_failures

Nested cross-validation


In [35]:
scores = cross_val_score(GridSearchCV(SVC(), param_grid, cv=5), iris.data, iris.target, cv=5)
print("Cross-validation scores: ", scores)
print("Mean cross-validation score: ", scores.mean())


Cross-validation scores:  [ 0.967  1.     0.967  0.967  1.   ]
Mean cross-validation score:  0.98

In [36]:
def nested_cv(X, y, inner_cv, outer_cv, Classifier, parameter_grid):
    outer_scores = []
    # for each split of the data in the outer cross-validation
    # (split method returns indices)
    for training_samples, test_samples in outer_cv.split(X, y):
        # find best parameter using inner cross-validation:
        best_params = {}
        best_score = -np.inf
        # iterate over parameters
        for parameters in parameter_grid:
            # accumulate score over inner splits
            cv_scores = []
            # iterate over inner cross-validation
            for inner_train, inner_test in inner_cv.split(X[training_samples], y[training_samples]):
                # build classifier given parameters and training data
                clf = Classifier(**parameters)
                clf.fit(X[inner_train], y[inner_train])
                # evaluate on inner test set
                score = clf.score(X[inner_test], y[inner_test])
                cv_scores.append(score)
            # compute mean score over inner folds
            mean_score = np.mean(cv_scores)
            if mean_score > best_score:
                # if better than so far, remember parameters
                best_score = mean_score
                best_params = parameters
        # build classifier on best parameters using outer training set
        clf = Classifier(**best_params)
        clf.fit(X[training_samples], y[training_samples])
        # evaluate 
        outer_scores.append(clf.score(X[test_samples], y[test_samples]))
    return outer_scores

In [37]:
from sklearn.model_selection import ParameterGrid, StratifiedKFold
nested_cv(iris.data, iris.target, StratifiedKFold(5), StratifiedKFold(5), SVC, ParameterGrid(param_grid))


Out[37]:
[0.96666666666666667, 1.0, 0.96666666666666667, 0.96666666666666667, 1.0]

Exercises

Load the bank campaign dataset and split it into a training and a test set. Apply grid search to the training set, searching for the best C for LogisticRegression; also search over the L1 vs. L2 penalty.


In [ ]:
X[::113], y[::113]
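
A possible starting point for the exercise, assuming the bank campaign data is available as a CSV file (the path and the name of the target column below are hypothetical placeholders):

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

# hypothetical path and target column; adjust to the actual dataset
data = pd.read_csv("data/bank-campaign.csv")
X = data.drop("target", axis=1).values
y = data["target"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# search over C and the penalty type; the liblinear solver supports both L1 and L2
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'penalty': ['l1', 'l2']}
grid = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid, cv=5)
grid.fit(X_train, y_train)
print("best parameters: ", grid.best_params_)
print("test set score: ", grid.score(X_test, y_test))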